Load a sample of the raw JSON data into pandas and list the available columns.

In [107]:
import pandas as pd
json_file = 'sample_data'
# Calling list() on a DataFrame returns its column labels.
list(pd.read_json(json_file, lines=True))


Out[107]:
['archived',
 'author',
 'author_flair_css_class',
 'author_flair_text',
 'body',
 'controversiality',
 'created_utc',
 'distinguished',
 'downs',
 'edited',
 'gilded',
 'id',
 'link_id',
 'name',
 'parent_id',
 'retrieved_on',
 'score',
 'score_hidden',
 'subreddit',
 'subreddit_id',
 'ups']
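To sanity-check more than the column names, it helps to glance at a few example rows and dtypes. A minimal sketch, reusing `pd` and `json_file` from the cell above and assuming the sample file is small enough to load in full:


In [ ]:
# Peek at dtypes and a few example rows of the sample.
# The columns shown here are just a readable subset.
sample_df = pd.read_json(json_file, lines=True)
print(sample_df.dtypes)
sample_df[['author', 'subreddit', 'score', 'body']].head()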
Transform the full JSON file into a CSV, dropping records we won't need:
  • [deleted] users or comments
  • comments with <10 tokens

(WARNING: this takes ~2.5 hours)


In [ ]:
import csv
import json
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm

MIN_NUM_WORD_TOKENS = 10
TOTAL_NUM_LINES = 53851542  # $ wc -l data_full.json 
PBAR_UPDATE_SIZE = 10000

tokenizer = TweetTokenizer()

def _ok_to_write(entries):
    """Keep a comment only if its author and body are intact and the body is long enough."""
    if entries['author'] == '[deleted]':
        return False
    if entries['body'] == '[deleted]' or len(tokenizer.tokenize(entries['body'])) < MIN_NUM_WORD_TOKENS:
        return False
    return True

out_columns = [
    'author',
    'body',
    'subreddit',
    'subreddit_id',
    'score',
]
in_filename = 'data_full.json'
out_filename = 'data_full_preprocessed.csv'
count = 0
pbar = tqdm(total=TOTAL_NUM_LINES)
with open(out_filename, 'w') as o:
    writer = csv.DictWriter(o, fieldnames=out_columns, extrasaction='ignore', 
                            delimiter=',', quoting=csv.QUOTE_MINIMAL)
    writer.writeheader()
    with open(in_filename, 'r') as f:
        for line in f:
            count += 1
            if count % PBAR_UPDATE_SIZE == 0:
                pbar.update(PBAR_UPDATE_SIZE)
            entries = json.loads(line)
            if _ok_to_write(entries):
                writer.writerow(entries)
pbar.close()
print('Done. Processed {} lines total.'.format(count))
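Before moving on, a quick sanity check of the preprocessed CSV is cheap compared to the ~2.5-hour pass above. A minimal sketch that streams the file in chunks (the chunk size is arbitrary):


In [ ]:
import pandas as pd

# Count surviving rows and confirm the header without loading the whole CSV.
n_rows = 0
for chunk in pd.read_csv('data_full_preprocessed.csv', chunksize=100000):
    n_rows += len(chunk)
print('Columns: {}'.format(chunk.columns.tolist()))
print('Rows kept after filtering: {}'.format(n_rows))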
Create CSVs of comments that mention anorexia or obesity, matching lemmatized tokens against WordNet synonym sets for each term. These identify users who have posted about either topic.

In [15]:
import pandas as pd
from tqdm import tqdm
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()
wordnet_lemmatizer = WordNetLemmatizer()

# Create synonym sets for obesity and anorexia
def syn_set(word_list):
    syns = set()
    for word in word_list:
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                syns.add(lemma.name())
    return syns    

OBESITY_SYNS = syn_set(['obesity'])
ANOREXIA_SYNS = syn_set(['anorexia'])

def row_filter_fn(body, syns):
    """Returns True if the comment body contains any of the synonyms, False otherwise."""
    lemmas = {wordnet_lemmatizer.lemmatize(token.lower()) for token in tokenizer.tokenize(body)}
    return bool(lemmas & syns)

csv_filename = 'data_full_preprocessed.csv'
chunksize = 10000
count = 0
obesity_data_frames = []
anorexia_data_frames = []
for chunk in tqdm(pd.read_csv(csv_filename, chunksize=chunksize)):
    obesity_df = chunk[chunk['body'].apply(row_filter_fn, syns=OBESITY_SYNS)]
    if not obesity_df.empty:
        obesity_data_frames.append(obesity_df)
    anorexia_df = chunk[chunk['body'].apply(row_filter_fn, syns=ANOREXIA_SYNS)]
    if not anorexia_df.empty:
        anorexia_data_frames.append(anorexia_df)
    count += 1
    #if count == 100: break
print('Total # chunks processed: {}.'.format(count))

# Write out to CSVs.
pd.concat(obesity_data_frames).to_csv('obesity.csv', index=False)
pd.concat(anorexia_data_frames).to_csv('anorexia.csv', index=False)


3636it [11:22:37,  7.13s/it]
Total # chunks processed: 3636.
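As a quick check on the two filtered corpora, the sketch below prints row counts, unique authors, and the most common subreddits for each CSV; it assumes both files fit in memory.


In [ ]:
import pandas as pd

# Summarize each filtered corpus: size, unique authors, top subreddits.
for name in ['obesity.csv', 'anorexia.csv']:
    df = pd.read_csv(name)
    print('{}: {} comments from {} unique authors'.format(name, len(df), df['author'].nunique()))
    print(df['subreddit'].value_counts().head())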